{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "ab6d6610",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import json\n",
    "import pickle\n",
    "import random\n",
    "#from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "c48fc87a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the floor-speech author map, one ID per line. The context manager closes\n",
    "# the file handle instead of leaving it open for the rest of the session.\n",
    "with open('../floor_speeches_congs_115_116/clean/author_map.txt') as author_map_file:\n",
    "    speakers_floor_speeches_bids = author_map_file.readlines()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "c9390839",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "513\n"
     ]
    }
   ],
   "source": [
    "# Strip trailing whitespace (including the newline) from every ID.\n",
    "speakers_floor_speeches_bids = [bid.rstrip() for bid in speakers_floor_speeches_bids]\n",
    "print(len(speakers_floor_speeches_bids))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "96fc7341",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 3445918 entries, 0 to 3445917\n",
      "Data columns (total 4 columns):\n",
      " #   Column     Dtype \n",
      "---  ------     ----- \n",
      " 0   Tweet ID   int64 \n",
      " 1   Author ID  int64 \n",
      " 2   Text       object\n",
      " 3   Timestamp  object\n",
      "dtypes: int64(2), object(2)\n",
      "memory usage: 105.2+ MB\n",
      "None\n"
     ]
    }
   ],
   "source": [
    "# Load every collected tweet.\n",
    "raw_data = pd.read_csv('all_tweets_df.csv')\n",
    "# info() prints its report and returns None, so it is called bare rather than printed.\n",
    "raw_data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "3c6aa2b5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "829"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of distinct tweet authors in the full data set.\n",
    "raw_data['Author ID'].nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "0fdfa9d5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "530\n"
     ]
    }
   ],
   "source": [
    "# NOTE: pickle can execute arbitrary code on load; only unpickle files from a trusted source.\n",
    "# The context manager closes the file handle once the mapping is loaded.\n",
    "with open('social_media_data_bioguide_to_twitter.pkl', 'rb') as pkl_file:\n",
    "    bid_to_twitter_uid = pickle.load(pkl_file)\n",
    "print(len(bid_to_twitter_uid))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "7a87b4ec",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "502\n"
     ]
    }
   ],
   "source": [
    "# Collect the Twitter user IDs of the floor-speech speakers that have an entry in\n",
    "# bid_to_twitter_uid; tweets from every other author are dropped afterwards.\n",
    "relev_twitter_uids = sorted({\n",
    "    bid_to_twitter_uid[bid]\n",
    "    for bid in speakers_floor_speeches_bids\n",
    "    if bid in bid_to_twitter_uid\n",
    "})\n",
    "print(len(relev_twitter_uids))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "2666008a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 2667283 entries, 4 to 3445678\n",
      "Data columns (total 4 columns):\n",
      " #   Column     Dtype \n",
      "---  ------     ----- \n",
      " 0   Tweet ID   int64 \n",
      " 1   Author ID  int64 \n",
      " 2   Text       object\n",
      " 3   Timestamp  object\n",
      "dtypes: int64(2), object(2)\n",
      "memory usage: 101.7+ MB\n",
      "None\n"
     ]
    }
   ],
   "source": [
    "# Keep only tweets written by the relevant Twitter accounts.\n",
    "raw_data = raw_data[raw_data['Author ID'].isin(relev_twitter_uids)]\n",
    "# info() prints its report and returns None, so it is called bare rather than printed.\n",
    "raw_data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "02ffa1b2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "471"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of distinct tweet authors left after filtering.\n",
    "raw_data['Author ID'].nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "57b088bd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n"
     ]
    }
   ],
   "source": [
    "# Flag speakers who tweeted fewer than `thresh` times. A single value_counts()\n",
    "# pass replaces one boolean scan of the whole frame per author.\n",
    "# NOTE: this cell only counts the flagged speakers; it does not drop them.\n",
    "thresh = 100\n",
    "speakers = set(raw_data['Author ID'])\n",
    "tweets_per_author = raw_data['Author ID'].value_counts()\n",
    "speakers_to_remove_based_on_num_tweets = set(\n",
    "    tweets_per_author[tweets_per_author < thresh].index.tolist())\n",
    "print(len(speakers_to_remove_based_on_num_tweets))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "4a5e72db",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1456\n"
     ]
    }
   ],
   "source": [
    "# Start from the stopwords used for the floor speeches; the stopwords the TBIP\n",
    "# authors used for senate tweets are merged in by the following cells.\n",
    "# The context manager closes the file handle after reading.\n",
    "with open('../floor_speeches_congs_115_116/stopwords.txt') as stopwords_file:\n",
    "    stopwords = [line.rstrip() for line in stopwords_file]\n",
    "print(len(stopwords))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "52598274",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "505\n"
     ]
    }
   ],
   "source": [
    "# Stopwords shipped for senate tweets, one per line. The context manager closes\n",
    "# the file handle after reading.\n",
    "with open('../../setup/stopwords/senate_tweets.txt') as sen_tweets_file:\n",
    "    stopwords_sen_tweets = [line.rstrip() for line in sen_tweets_file]\n",
    "print(len(stopwords_sen_tweets))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "0fbcc624",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1961\n"
     ]
    }
   ],
   "source": [
    "# Append the senate-tweet stopwords to the speech stopwords (duplicates are kept here).\n",
    "stopwords = [*stopwords, *stopwords_sen_tweets]\n",
    "print(len(stopwords))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "a0204343",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 435 entries, 0 to 434\n",
      "Data columns (total 3 columns):\n",
      " #   Column  Non-Null Count  Dtype \n",
      "---  ------  --------------  ----- \n",
      " 0   Token   435 non-null    object\n",
      " 1   Uid     435 non-null    int64 \n",
      " 2   Link    435 non-null    object\n",
      "dtypes: int64(1), object(2)\n",
      "memory usage: 10.3+ KB\n",
      "None\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Token</th>\n",
       "      <th>Uid</th>\n",
       "      <th>Link</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>KYComer</td>\n",
       "      <td>838462994</td>\n",
       "      <td>https://twitter.com/KYComer</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>RepJackyRosen</td>\n",
       "      <td>818554054309715969</td>\n",
       "      <td>https://twitter.com/RepJackyRosen</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>RepEspaillat</td>\n",
       "      <td>817076257770835968</td>\n",
       "      <td>https://twitter.com/RepEspaillat</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>RepTrey</td>\n",
       "      <td>811986281177772032</td>\n",
       "      <td>https://twitter.com/RepTrey</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>RepDwightEvans</td>\n",
       "      <td>90639372</td>\n",
       "      <td>https://twitter.com/RepDwightEvans</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            Token                 Uid                                Link\n",
       "0         KYComer           838462994         https://twitter.com/KYComer\n",
       "1   RepJackyRosen  818554054309715969   https://twitter.com/RepJackyRosen\n",
       "2    RepEspaillat  817076257770835968    https://twitter.com/RepEspaillat\n",
       "3         RepTrey  811986281177772032         https://twitter.com/RepTrey\n",
       "4  RepDwightEvans            90639372  https://twitter.com/RepDwightEvans"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rep_accounts_115 = pd.read_csv('representatives-accounts-1.csv')\n",
    "# info() prints its report and returns None, so it is called bare rather than printed.\n",
    "rep_accounts_115.info()\n",
    "rep_accounts_115.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "b20e323c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2396\n"
     ]
    }
   ],
   "source": [
    "# Treat the lower-cased account tokens from this table as stopwords too.\n",
    "stopwords = stopwords + [token.lower() for token in rep_accounts_115['Token']]\n",
    "print(len(stopwords))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "49461bed",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 760 entries, 0 to 759\n",
      "Data columns (total 3 columns):\n",
      " #   Column  Non-Null Count  Dtype \n",
      "---  ------  --------------  ----- \n",
      " 0   Token   760 non-null    object\n",
      " 1   Uid     760 non-null    int64 \n",
      " 2   Link    760 non-null    object\n",
      "dtypes: int64(1), object(2)\n",
      "memory usage: 17.9+ KB\n",
      "None\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Token</th>\n",
       "      <th>Uid</th>\n",
       "      <th>Link</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>RepDevinNunes</td>\n",
       "      <td>344972339</td>\n",
       "      <td>https://twitter.com/RepDevinNunes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>AngieCraigMN</td>\n",
       "      <td>411861905</td>\n",
       "      <td>https://twitter.com/AngieCraigMN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>benraylujan</td>\n",
       "      <td>91125308</td>\n",
       "      <td>https://twitter.com/benraylujan</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>bradyfortexas</td>\n",
       "      <td>570005456</td>\n",
       "      <td>https://twitter.com/bradyfortexas</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>chiproytx</td>\n",
       "      <td>1257667158</td>\n",
       "      <td>https://twitter.com/chiproytx</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           Token         Uid                               Link\n",
       "0  RepDevinNunes   344972339  https://twitter.com/RepDevinNunes\n",
       "1   AngieCraigMN   411861905   https://twitter.com/AngieCraigMN\n",
       "2    benraylujan    91125308    https://twitter.com/benraylujan\n",
       "3  bradyfortexas   570005456  https://twitter.com/bradyfortexas\n",
       "4      chiproytx  1257667158      https://twitter.com/chiproytx"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rep_accounts_116 = pd.read_csv('congress116-house-accounts.csv')\n",
    "# info() prints its report and returns None, so it is called bare rather than printed.\n",
    "rep_accounts_116.info()\n",
    "rep_accounts_116.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "cd705680",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3156\n"
     ]
    }
   ],
   "source": [
    "# Treat the lower-cased account tokens from this table as stopwords too.\n",
    "stopwords = stopwords + [token.lower() for token in rep_accounts_116['Token']]\n",
    "print(len(stopwords))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "47f0c0ae",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2460\n"
     ]
    }
   ],
   "source": [
    "# Collapse duplicate stopwords.\n",
    "stopwords = {*stopwords}\n",
    "print(len(stopwords))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "0df4c5ae",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the merged stopwords, one per line with no trailing newline.\n",
    "# Sorting makes the file identical from run to run (set iteration order is not\n",
    "# stable), and the `with` block closes the file even if a write fails.\n",
    "with open('stopwords.txt', 'w') as f:\n",
    "    f.write('\\n'.join(sorted(stopwords)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "759936e7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The rest of the preprocessing roughly follows the script provided in the TBIP repo: setup/senate_tweets_to_bag_of_words.py\n",
    "import os\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from scipy import sparse\n",
    "from sklearn.feature_extraction.text import CountVectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "8aaea99d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2667283\n",
      "2667283\n"
     ]
    }
   ],
   "source": [
    "# Below, \"speeches\" means tweets: one author ID and one text per row of raw_data.\n",
    "speakers = raw_data['Author ID'].tolist()\n",
    "print(len(speakers))\n",
    "speeches = raw_data['Text'].tolist()\n",
    "print(len(speeches))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "6111e250",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2667283\n",
      "471\n"
     ]
    }
   ],
   "source": [
    "# Give every distinct author a dense integer index, in ascending order of author ID.\n",
    "unique_speakers = sorted(set(speakers))\n",
    "speaker_to_speaker_id = {speaker: idx for idx, speaker in enumerate(unique_speakers)}\n",
    "# author_indices[d] is the index of the author of document d.\n",
    "author_indices = np.array([speaker_to_speaker_id[speaker] for speaker in speakers])\n",
    "print(len(author_indices))\n",
    "# author_map[k] is the author ID that was assigned index k.\n",
    "author_map = np.array(unique_speakers)\n",
    "print(len(author_map))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "327e6fb0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "300006\n"
     ]
    }
   ],
   "source": [
    "# The data is too big as it stands: randomly sample tweets so as to preserve the\n",
    "# original distribution of tweets-per-author.\n",
    "sampled_N = 300000\n",
    "N = len(speeches)\n",
    "\n",
    "# Positional row indices of each author's tweets.\n",
    "author_to_inds = {}\n",
    "for i, a in enumerate(speakers):\n",
    "    author_to_inds.setdefault(a, []).append(i)\n",
    "\n",
    "selected_inds = []\n",
    "for author_inds in author_to_inds.values():\n",
    "    # A fresh Random(1) per author makes each author's shuffle reproducible.\n",
    "    random.Random(1).shuffle(author_inds)\n",
    "    # Quota proportional to the author's share of all tweets. Quotas are rounded\n",
    "    # independently, so their total can differ slightly from sampled_N.\n",
    "    quota = round(sampled_N * (len(author_inds) / N))\n",
    "    # extend() grows the list in place instead of recopying it on every iteration.\n",
    "    selected_inds.extend(author_inds[:quota])\n",
    "\n",
    "print(len(selected_inds))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "5a7796c0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 300006 entries, 349114 to 3191035\n",
      "Data columns (total 4 columns):\n",
      " #   Column     Non-Null Count   Dtype \n",
      "---  ------     --------------   ----- \n",
      " 0   Tweet ID   300006 non-null  int64 \n",
      " 1   Author ID  300006 non-null  int64 \n",
      " 2   Text       300006 non-null  object\n",
      " 3   Timestamp  300006 non-null  object\n",
      "dtypes: int64(2), object(2)\n",
      "memory usage: 11.4+ MB\n",
      "None\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Tweet ID</th>\n",
       "      <th>Author ID</th>\n",
       "      <th>Text</th>\n",
       "      <th>Timestamp</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>349114</th>\n",
       "      <td>996837967976194048</td>\n",
       "      <td>818554054309715969</td>\n",
       "      <td>The skyrocketing cost of life-saving medicatio...</td>\n",
       "      <td>2018-05-16 19:41:06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>55260</th>\n",
       "      <td>878358077145325569</td>\n",
       "      <td>818554054309715969</td>\n",
       "      <td>To all of the brave women who have had to say ...</td>\n",
       "      <td>2017-06-23 21:04:19</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>152423</th>\n",
       "      <td>979935734701412353</td>\n",
       "      <td>818554054309715969</td>\n",
       "      <td>#Passover is a beautiful time to remember our ...</td>\n",
       "      <td>2018-03-31 04:17:40</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>109698</th>\n",
       "      <td>1040330596545904643</td>\n",
       "      <td>818554054309715969</td>\n",
       "      <td>I was glad to help pass bipartisan legislation...</td>\n",
       "      <td>2018-09-13 20:05:17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>162220</th>\n",
       "      <td>923648345394040833</td>\n",
       "      <td>818554054309715969</td>\n",
       "      <td>GOP budget does nothing to help Nevada's middl...</td>\n",
       "      <td>2017-10-26 20:31:40</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   Tweet ID           Author ID  \\\n",
       "349114   996837967976194048  818554054309715969   \n",
       "55260    878358077145325569  818554054309715969   \n",
       "152423   979935734701412353  818554054309715969   \n",
       "109698  1040330596545904643  818554054309715969   \n",
       "162220   923648345394040833  818554054309715969   \n",
       "\n",
       "                                                     Text            Timestamp  \n",
       "349114  The skyrocketing cost of life-saving medicatio...  2018-05-16 19:41:06  \n",
       "55260   To all of the brave women who have had to say ...  2017-06-23 21:04:19  \n",
       "152423  #Passover is a beautiful time to remember our ...  2018-03-31 04:17:40  \n",
       "109698  I was glad to help pass bipartisan legislation...  2018-09-13 20:05:17  \n",
       "162220  GOP budget does nothing to help Nevada's middl...  2017-10-26 20:31:40  "
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#selected_inds = sorted(selected_inds)\n",
    "# selected_inds holds positional row numbers, hence .iloc rather than .loc.\n",
    "raw_data = raw_data.iloc[selected_inds]\n",
    "# info() prints its report and returns None, so it is called bare rather than printed.\n",
    "raw_data.info()\n",
    "raw_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "b1a9659c",
   "metadata": {},
   "outputs": [],
   "source": [
    "#raw_data.to_csv('sampled_twitter_data_raw.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "2c776b5b-b233-4efc-99c2-60469695ca9b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reload the sample from CSV. If the file does not exist yet, write the current\n",
    "# sample first so that a fresh Restart & Run All does not fail here; an existing\n",
    "# file is reused as-is.\n",
    "sampled_csv_path = 'sampled_twitter_data_raw.csv'\n",
    "if not os.path.exists(sampled_csv_path):\n",
    "    raw_data.to_csv(sampled_csv_path, index=False)\n",
    "raw_data = pd.read_csv(sampled_csv_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "39083a97",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "300006\n",
      "300006\n"
     ]
    }
   ],
   "source": [
    "# Below, \"speeches\" means tweets: one author ID and one text per row of raw_data.\n",
    "speakers = raw_data['Author ID'].tolist()\n",
    "print(len(speakers))\n",
    "speeches = raw_data['Text'].tolist()\n",
    "print(len(speeches))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "87df7a1a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "300006\n",
      "471\n"
     ]
    }
   ],
   "source": [
    "# Give every distinct author a dense integer index, in ascending order of author ID.\n",
    "unique_speakers = sorted(set(speakers))\n",
    "speaker_to_speaker_id = {speaker: idx for idx, speaker in enumerate(unique_speakers)}\n",
    "# author_indices[d] is the index of the author of document d.\n",
    "author_indices = np.array([speaker_to_speaker_id[speaker] for speaker in speakers])\n",
    "print(len(author_indices))\n",
    "# author_map[k] is the author ID that was assigned index k.\n",
    "author_map = np.array(unique_speakers)\n",
    "print(len(author_map))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "7b870015-f958-4949-949d-29229d8f8673",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2537\n"
     ]
    }
   ],
   "source": [
    "# Extra tokens to exclude (presumably name fragments and account handles that\n",
    "# survived the earlier stopword lists).\n",
    "extra_stopwords = ['abby', 'allard', 'andr', 'andybiggs', 'anthonybrownmd', 'auctnr', 'austinscottga', 'balart', 'balderson', 'barrag', 'bettymccollum', 'clyburnsc', 'col', 'colon', 'conawaytx', 'congressnm', 'cortez', 'davejoyceoh', 'deb', 'delgadoforny', 'desjarlaistn', 'dr', 'drmarkgreen', 'drnealdunnfl', 'dutch', 'garc', 'gonz', 'guti', 'halleran', 'jackbergman', 'jacksonleetx', 'jasoncrowco', 'jes', 'jos', 'jr', 'katiehill', 'kellyforms', 'labrador', 'lacyclaymo', 'lamarsmithtx', 'lehtinen', 'lez', 'louiegohmerttx', 'luj', 'markamodeinv', 'maxrose', 'mikesimpson', 'mucarsel', 'nchez', 'ocasio', 'powell', 'ra', 'ratcliffetx', 'raulruiz', 'rdenas', 'rep', 'replancenj', 'ros', 'rourke', 'roybal', 'rrez', 'sfv', 'shea', 'spanbergerva', 'sr', 'steveknight', 'stevewomack', 'teamgt', 'tomcoleok', 'tomgravesga', 'troy', 'txrandy', 'vel', 'vickyh', 'wm', 'wny', 'zquez']\n",
    "# A set union (rather than list concatenation) means re-running this cell cannot\n",
    "# append the extras a second time; sorted() yields a list in a stable order.\n",
    "stopwords = sorted(set(stopwords) | set(extra_stopwords))\n",
    "print(len(stopwords))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "8ef6c030",
   "metadata": {},
   "outputs": [],
   "source": [
    "# First pass: learn a provisional document-term matrix. It is provisional because\n",
    "# it is only used to decide which terms to exclude based on author counts.\n",
    "vectorizer_options = dict(\n",
    "    min_df=50,\n",
    "    max_df=0.5,\n",
    "    stop_words=stopwords,\n",
    "    ngram_range=(1, 3),\n",
    "    token_pattern=\"[a-zA-Z#]+\",\n",
    ")\n",
    "count_vectorizer = CountVectorizer(**vectorizer_options)\n",
    "counts = count_vectorizer.fit_transform(speeches)\n",
    "# Order the terms by column index so that vocabulary[j] names column j of `counts`.\n",
    "term_to_column = count_vectorizer.vocabulary_\n",
    "vocabulary = np.array(sorted(term_to_column, key=term_to_column.get))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "bead444b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(300006, 9408)\n",
      "9408\n"
     ]
    }
   ],
   "source": [
    "# Sanity check: the number of columns in `counts` should equal the vocabulary size.\n",
    "print(counts.shape)\n",
    "print(len(vocabulary))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "cf3734aa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Map each author ID to the row positions of that author's documents.\n",
    "# One vectorised comparison per author replaces a Python-level scan of every\n",
    "# document for every author; .tolist() keeps the values plain lists of ints.\n",
    "author_to_inds = {\n",
    "    a: np.flatnonzero(author_indices == speaker_to_speaker_id[a]).tolist()\n",
    "    for a in author_map\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "b0759eee",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_per_author_counts(counts, author_to_inds):\n",
    "    list_of_arrays = []\n",
    "    for a in author_to_inds:\n",
    "        inds = author_to_inds[a]\n",
    "        list_of_arrays.append(np.array(np.sum(counts[inds], 0)))#.reshape((1, counts.shape[1])))\n",
    "    return np.concatenate(list_of_arrays, 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "eb8ed309",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(471, 9408)\n"
     ]
    }
   ],
   "source": [
    "# Aggregate the document-term counts per author: one row per entry of author_to_inds.\n",
    "counts_per_author = get_per_author_counts(counts, author_to_inds)\n",
    "print(counts_per_author.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "0c4729a4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove phrases used by fewer than 5 distinct authors.\n",
    "min_authors_per_word = 5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "27a18645",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "9343\n"
     ]
    }
   ],
   "source": [
    "# Column indices of the terms used by at least `min_authors_per_word` distinct authors.\n",
    "authors_per_word = np.count_nonzero(counts_per_author[:, :len(vocabulary)], axis=0)\n",
    "acceptable_words = np.flatnonzero(authors_per_word >= min_authors_per_word).tolist()\n",
    "print(len(acceptable_words))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "6d9cc9f6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Second pass: recount using only the retained terms. The tokenisation settings\n",
    "# must match the first pass. With CountVectorizer's default token pattern a '#'\n",
    "# is never part of a token, so hashtag terms in the vocabulary would always count\n",
    "# zero; and without stop-word removal the n-grams are formed from different\n",
    "# token sequences than the ones the vocabulary was learned on.\n",
    "count_vectorizer = CountVectorizer(stop_words=stopwords,\n",
    "                                   ngram_range=(1, 3),\n",
    "                                   token_pattern=\"[a-zA-Z#]+\",\n",
    "                                   vocabulary=vocabulary[acceptable_words])\n",
    "counts = count_vectorizer.fit_transform(speeches)\n",
    "# Order the terms by column index so that vocabulary[j] names column j of `counts`.\n",
    "vocabulary = np.array(\n",
    "    [k for (k, v) in sorted(count_vectorizer.vocabulary_.items(), \n",
    "                            key=lambda kv: kv[1])])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "5eead8b5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(300006, 9343)\n",
      "9343\n"
     ]
    }
   ],
   "source": [
    "# Sanity check: the number of columns in `counts` should equal the vocabulary size.\n",
    "print(counts.shape)\n",
    "print(len(vocabulary))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "758c7d1b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Column index of every vocabulary term, so the lookups below are dict hits\n",
    "# rather than repeated linear scans of the vocabulary array.\n",
    "term_to_index = {term: index for index, term in enumerate(vocabulary)}\n",
    "\n",
    "# `n_gram_to_unigrams` takes as key the vocabulary index of an n-gram (n > 1);\n",
    "# its value is the list of vocabulary indices of those component unigrams\n",
    "# that are themselves in the vocabulary.\n",
    "n_gram_indices = np.where(\n",
    "  np.array([len(word.split(' ')) for word in vocabulary]) > 1)[0]\n",
    "n_gram_to_unigrams = {}\n",
    "for n_gram_index in n_gram_indices:\n",
    "    words = vocabulary[n_gram_index].split(' ')\n",
    "    n_gram_to_unigrams[n_gram_index] = [\n",
    "        term_to_index[word] for word in words if word in term_to_index]\n",
    "\n",
    "# `n_grams_to_bigrams` breaks apart trigrams and higher to find the bigrams\n",
    "# contained in them that are themselves in the vocabulary.\n",
    "n_grams_to_bigrams = {}\n",
    "for n_gram_index in n_gram_indices:\n",
    "    words = vocabulary[n_gram_index].split(' ')\n",
    "    if len(words) > 2:\n",
    "        bigrams = (\" \".join(words[i:(i + 2)]) for i in range(len(words) - 1))\n",
    "        n_grams_to_bigrams[n_gram_index] = [\n",
    "            term_to_index[bigram] for bigram in bigrams if bigram in term_to_index]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "9cc1771b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Go through counts, and remove a unigram each time a bigram superset \n",
    "# appears. Also remove a bigram each time a trigram superset appears.\n",
    "# Note this isn't perfect: if bigrams overlap (e.g. \"global health care\" \n",
    "# contains \"global health\" and \"health care\"), we count them both. This\n",
    "# may introduce a problem where we subtract a unigram count twice, so we also\n",
    "# ensure non-negativity.\n",
    "for i in range(counts.shape[0]):\n",
    "    # The row slice is 2-D with shape (1, K), so np.where returns (rows, cols).\n",
    "    # The column positions are element [1]; element [0] is all zeros and would\n",
    "    # select the first n-gram over and over instead of the n-grams in this document.\n",
    "    n_grams_in_doc = np.where(counts[i, n_gram_indices].toarray() > 0)[1]\n",
    "    sub_n_grams = n_gram_indices[n_grams_in_doc]\n",
    "    for n_gram in sub_n_grams:\n",
    "        n_gram_count = counts[i, n_gram]\n",
    "        # Skip n-grams with no component terms in the vocabulary (empty column list).\n",
    "        unigram_cols = n_gram_to_unigrams[n_gram]\n",
    "        if unigram_cols:\n",
    "            counts[i, unigram_cols] = sparse.csr_matrix(counts[i, unigram_cols].toarray() - n_gram_count)\n",
    "        bigram_cols = n_grams_to_bigrams.get(n_gram)\n",
    "        if bigram_cols:\n",
    "            counts[i, bigram_cols] = sparse.csr_matrix(counts[i, bigram_cols].toarray() - n_gram_count)\n",
    "# Clip the counts that went negative through double subtraction.\n",
    "counts[counts < 0] = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "37d10700",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(300006, 9343)\n"
     ]
    }
   ],
   "source": [
    "# The n-gram adjustment only edits values, so the shape should be unchanged.\n",
    "print(counts.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "65ad00a8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(295327, 9343)\n",
      "(295327,)\n"
     ]
    }
   ],
   "source": [
    "# Remove speeches with no words: keep the rows whose total count is positive.\n",
    "row_totals = np.asarray(counts.sum(axis=1)).ravel()\n",
    "existing_speeches = np.flatnonzero(row_totals > 0).tolist()\n",
    "counts = counts[existing_speeches]\n",
    "print(counts.shape)\n",
    "# Keep the author index of each surviving document aligned with the rows of `counts`.\n",
    "author_indices = author_indices[existing_speeches]\n",
    "print(author_indices.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "90f7b5a3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save data.\n",
    "\n",
    "# Create the output directory if needed so the saves below cannot fail on a fresh checkout.\n",
    "os.makedirs(\"clean2\", exist_ok=True)\n",
    "\n",
    "# `counts.npz` is a [num_documents, num_words] sparse matrix containing the\n",
    "# word counts for each document.\n",
    "sparse.save_npz(\"clean2/counts.npz\",\n",
    "                counts.astype(np.float32))\n",
    "\n",
    "# `author_indices.npy` is a [num_documents] vector where each entry is an\n",
    "# integer indicating the author of the corresponding document.\n",
    "np.save(\"clean2/author_indices.npy\", author_indices)\n",
    "\n",
    "# `vocabulary.txt` is a [num_words] vector where each entry is a string\n",
    "# denoting the corresponding word in the vocabulary.\n",
    "np.savetxt(\"clean2/vocabulary.txt\", vocabulary, fmt=\"%s\")\n",
    "\n",
    "# `author_map.txt` is a [num_authors] vector of author IDs taken from the\n",
    "# 'Author ID' column of the tweets (not bioguide IDs); line k is the author\n",
    "# whose index in `author_indices` is k.\n",
    "np.savetxt(\"clean2/author_map.txt\", author_map, fmt=\"%s\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "5c336e25",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "295327\n"
     ]
    }
   ],
   "source": [
    "# `raw_documents.txt` contains all the documents we ended up using.\n",
    "raw_documents = [document.replace(\"\\n\", ' ').replace(\"\\r\", ' ') \n",
    "                 for i, document in enumerate(speeches) if i in existing_speeches]\n",
    "print(len(raw_documents))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "8a267f82",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One document per line: a newline between documents, none after the last.\n",
    "# `with` flushes and closes the file even if a write raises. `f` stays bound\n",
    "# afterwards, so calling `f.close()` again later is a harmless no-op.\n",
    "with open('clean2/raw_documents.txt', 'w') as f:\n",
    "    for i, doc in enumerate(raw_documents):\n",
    "        if i > 0:\n",
    "            f.write('\\n')\n",
    "        f.write(doc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "19c68176",
   "metadata": {},
   "outputs": [],
   "source": [
    "f.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "f5cf2f7a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 295327 entries, 0 to 300005\n",
      "Data columns (total 4 columns):\n",
      " #   Column     Non-Null Count   Dtype \n",
      "---  ------     --------------   ----- \n",
      " 0   Tweet ID   295327 non-null  int64 \n",
      " 1   Author ID  295327 non-null  int64 \n",
      " 2   Text       295327 non-null  object\n",
      " 3   Timestamp  295327 non-null  object\n",
      "dtypes: int64(2), object(2)\n",
      "memory usage: 11.3+ MB\n",
      "None\n"
     ]
    }
   ],
   "source": [
    "raw_data = raw_data.iloc[existing_speeches]\n",
    "print(raw_data.info())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "50dc94b6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Tweet ID</th>\n",
       "      <th>Author ID</th>\n",
       "      <th>Text</th>\n",
       "      <th>Timestamp</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>996837967976194048</td>\n",
       "      <td>818554054309715969</td>\n",
       "      <td>The skyrocketing cost of life-saving medicatio...</td>\n",
       "      <td>2018-05-16 19:41:06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>878358077145325569</td>\n",
       "      <td>818554054309715969</td>\n",
       "      <td>To all of the brave women who have had to say ...</td>\n",
       "      <td>2017-06-23 21:04:19</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>979935734701412353</td>\n",
       "      <td>818554054309715969</td>\n",
       "      <td>#Passover is a beautiful time to remember our ...</td>\n",
       "      <td>2018-03-31 04:17:40</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1040330596545904643</td>\n",
       "      <td>818554054309715969</td>\n",
       "      <td>I was glad to help pass bipartisan legislation...</td>\n",
       "      <td>2018-09-13 20:05:17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>923648345394040833</td>\n",
       "      <td>818554054309715969</td>\n",
       "      <td>GOP budget does nothing to help Nevada's middl...</td>\n",
       "      <td>2017-10-26 20:31:40</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              Tweet ID           Author ID  \\\n",
       "0   996837967976194048  818554054309715969   \n",
       "1   878358077145325569  818554054309715969   \n",
       "2   979935734701412353  818554054309715969   \n",
       "3  1040330596545904643  818554054309715969   \n",
       "4   923648345394040833  818554054309715969   \n",
       "\n",
       "                                                Text            Timestamp  \n",
       "0  The skyrocketing cost of life-saving medicatio...  2018-05-16 19:41:06  \n",
       "1  To all of the brave women who have had to say ...  2017-06-23 21:04:19  \n",
       "2  #Passover is a beautiful time to remember our ...  2018-03-31 04:17:40  \n",
       "3  I was glad to help pass bipartisan legislation...  2018-09-13 20:05:17  \n",
       "4  GOP budget does nothing to help Nevada's middl...  2017-10-26 20:31:40  "
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "ca7d2401",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Tweet ID</th>\n",
       "      <th>Author ID</th>\n",
       "      <th>Text</th>\n",
       "      <th>Timestamp</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>300001</th>\n",
       "      <td>1202297800010141697</td>\n",
       "      <td>1176522535531360257</td>\n",
       "      <td>RT @RepMattGaetz: \"If you accept all of their ...</td>\n",
       "      <td>2019-12-04 18:45:07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>300002</th>\n",
       "      <td>1209902566969991168</td>\n",
       "      <td>1176522535531360257</td>\n",
       "      <td>Merry Christmas! 🎄 https://t.co/QsShG6aykE</td>\n",
       "      <td>2019-12-25 18:23:45</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>300003</th>\n",
       "      <td>1243625787045150723</td>\n",
       "      <td>1176522535531360257</td>\n",
       "      <td>Been working hard w/ @Statedept to help get US...</td>\n",
       "      <td>2020-03-27 19:47:47</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>300004</th>\n",
       "      <td>1247617377736548354</td>\n",
       "      <td>1176522535531360257</td>\n",
       "      <td>👏 @Allstate is REFUNDING $$$ hundreds of milli...</td>\n",
       "      <td>2020-04-07 20:08:57</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>300005</th>\n",
       "      <td>1224891359359918082</td>\n",
       "      <td>1176522535531360257</td>\n",
       "      <td>This President, @realDonaldTrump stands for LI...</td>\n",
       "      <td>2020-02-05 03:03:52</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   Tweet ID            Author ID  \\\n",
       "300001  1202297800010141697  1176522535531360257   \n",
       "300002  1209902566969991168  1176522535531360257   \n",
       "300003  1243625787045150723  1176522535531360257   \n",
       "300004  1247617377736548354  1176522535531360257   \n",
       "300005  1224891359359918082  1176522535531360257   \n",
       "\n",
       "                                                     Text            Timestamp  \n",
       "300001  RT @RepMattGaetz: \"If you accept all of their ...  2019-12-04 18:45:07  \n",
       "300002         Merry Christmas! 🎄 https://t.co/QsShG6aykE  2019-12-25 18:23:45  \n",
       "300003  Been working hard w/ @Statedept to help get US...  2020-03-27 19:47:47  \n",
       "300004  👏 @Allstate is REFUNDING $$$ hundreds of milli...  2020-04-07 20:08:57  \n",
       "300005  This President, @realDonaldTrump stands for LI...  2020-02-05 03:03:52  "
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "raw_data.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "ea26d299",
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_data.to_csv('finalized_tbip_tweets_sampled2.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "id": "f015c115",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Done\n"
     ]
    }
   ],
   "source": [
    "print('Done')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "f8e17772-dc55-41d5-91f0-6658dbdcde94",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Map each line of `vocabulary.txt` (surrounding whitespace stripped) to its\n",
    "# zero-based line number. If an entry repeats, the last line number wins.\n",
    "with open('clean2/vocabulary.txt') as fh:\n",
    "    dict1 = {line.strip(): i for i, line in enumerate(fh)}\n",
    "\n",
    "# `with` closes the JSON file even if `json.dump` raises.\n",
    "with open(\"clean2/vocabulary.json\", \"w\") as out_file:\n",
    "    json.dump(dict1, out_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c6c22c0f",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "pg36",
   "language": "python",
   "name": "pg36"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
